Picking the file manually with file.choose():
# file.choose() opens a file picker; the chosen path is handed to read.csv()
surveys <- read.csv(file = file.choose())
Using a path (in a string).
# Read the CSV from a path given as a string
surveys <- read.csv(file = "portal_data_joined.csv")
dplyr, tidyr, ggplot
# Only need to do this once
# Time consuming!
# Install the tidyverse only when it is missing, so re-running this script
# does not repeat the slow download.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}

# Attach dplyr so that its functions (select, filter, mutate, ...) can be
# called without the dplyr:: prefix.
library(dplyr)
# select() keeps only the listed columns.
# Syntax: select(df, col1, col2, ...)
new_df <- select(.data = surveys, plot_id, species_id, weight)

# Good habit: look at the first rows of every new data frame
head(new_df)
plot_id species_id weight
1 2 NL NA
2 2 NL NA
3 2 NL NA
4 2 NL NA
5 2 NL NA
6 2 NL NA
# filter() keeps the rows of `surveys` whose year equals 1995
new_df2 <- filter(.data = surveys, year == 1995)
head(new_df2)
record_id month day year plot_id species_id sex hindfoot_length weight
1 22314 6 7 1995 2 NL M 34 NA
2 22728 9 23 1995 2 NL F 32 165
3 22899 10 28 1995 2 NL F 32 171
4 23032 12 2 1995 2 NL F 33 NA
5 22003 1 11 1995 2 DM M 37 41
6 22042 2 4 1995 2 DM F 36 45
genus species taxa plot_type
1 Neotoma albigula Rodent Control
2 Neotoma albigula Rodent Control
3 Neotoma albigula Rodent Control
4 Neotoma albigula Rodent Control
5 Dipodomys merriami Rodent Control
6 Dipodomys merriami Rodent Control
# Why are the columns we did not select (last slide) still here?
# Step by step, each result feeding the next call:
#   1. keep four columns
#   2. keep the rows from 1995
#   3. add weight_kg (weight divided by 1000)
new_df <- select(.data = surveys, plot_id, species_id, weight, year)
new_df2 <- filter(.data = new_df, year == 1995)
new_df3 <- mutate(.data = new_df2, weight_kg = weight / 1000)

head(new_df3)
plot_id species_id weight year weight_kg
1 2 NL NA 1995 NA
2 2 NL 165 1995 0.165
3 2 NL 171 1995 0.171
4 2 NL NA 1995 NA
5 2 DM 41 1995 0.041
6 2 DM 45 1995 0.045
# What is wrong with this approach?
# NOTE: every statement below assigns its result back to `surveys`, so the
# data frame read from the CSV is replaced at each step.
surveys <- select(surveys, plot_id, species_id, weight, year)
surveys <- filter(surveys, year == 1995)
surveys <- mutate(surveys, weight_kg = weight / 1000)
head(surveys)
# Same steps written as a pipeline: %>% passes the left-hand result in as
# the first argument of the next call, so no intermediate names are needed.
surveys %>%
  select(plot_id, species_id, weight, year) %>%
  filter(year == 1995) %>%
  mutate(weight_kg = weight / 1000) %>%
  head()
plot_id species_id weight year weight_kg
1 2 NL NA 1995 NA
2 2 NL 165 1995 0.165
3 2 NL 171 1995 0.171
4 2 NL NA 1995 NA
5 2 DM 41 1995 0.041
6 2 DM 45 1995 0.045
# The trailing comments name the value the pipe supplies as the (hidden)
# first argument of each call.
surveys %>%
  select(plot_id, species_id, weight, year) %>%  # surveys
  filter(year == 1995) %>%                       # df_sel
  mutate(weight_kg = weight / 1000) %>%          # df_sel_filt
  head()                                         # df_sel_filt_mutate
plot_id species_id weight year weight_kg
1 2 NL NA 1995 NA
2 2 NL 165 1995 0.165
3 2 NL 171 1995 0.171
4 2 NL NA 1995 NA
5 2 DM 41 1995 0.041
6 2 DM 45 1995 0.045
# Final form of the pipeline: one verb per line, pipe at the end of the line
surveys %>%
  select(plot_id, species_id, weight, year) %>%
  filter(year == 1995) %>%
  mutate(weight_kg = weight / 1000) %>%
  head()
plot_id species_id weight year weight_kg
1 2 NL NA 1995 NA
2 2 NL 165 1995 0.165
3 2 NL 171 1995 0.171
4 2 NL NA 1995 NA
5 2 DM 41 1995 0.041
6 2 DM 45 1995 0.045
# Assign the pipeline's result to a name to keep it
surveys_small <-
  surveys %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight)

head(surveys_small)
species_id sex weight
1 PF F 4
2 PF F 4
3 PF M 4
4 RM F 4
5 RM M 4
6 PF 4
Create a new data frame from the surveys data that meets the following criteria:
- It contains only the species_id column and a new column called hindfoot_half, whose values are half the hindfoot_length values.
- The hindfoot_half column has no NAs and all of its values are less than 30.
Hint: think about how the commands should be ordered to produce this data frame!
# Attaching dplyr makes the %>% pipe available
library(dplyr)

# Round pi to two decimal places, then convert the number to a string
pi %>%
  round(digits = 2) %>%
  as.character()
[1] "3.14"
# Pipeline form: start from pi, round it, convert it to text
pi %>%
  round(digits = 2) %>%
  as.character()
[1] "3.14"
# The same computation with a named intermediate for every step
x <- pi
r_x <- round(x, digits = 2)  # rounded number
c_x <- as.character(r_x)     # rounded number as a string
# Pipeline equivalent of the intermediate-variable version
pi %>%
  round(digits = 2) %>%
  as.character()
[1] "3.14"
# Nested-call form: read from the inside out (round first, then convert)
as.character(round(pi, digits = 2))
[1] "3.14"
# Pipeline equivalent of the nested call: read top to bottom
pi %>%
  round(digits = 2) %>%
  as.character()
[1] "3.14"
# Rows with weight below 5, reduced to three columns; the result is printed
# because it is not assigned
surveys %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight)
species_id sex weight
1 PF F 4
2 PF F 4
3 PF M 4
4 RM F 4
5 RM M 4
6 PF 4
7 PP M 4
8 RM M 4
9 RM M 4
10 RM M 4
11 PF M 4
12 PF F 4
13 RM M 4
14 RM M 4
15 RM F 4
16 RM M 4
17 RM M 4
# Keep the rows with weight below 5, then keep three of the columns
surveys %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight)
species_id sex weight
1 PF F 4
2 PF F 4
3 PF M 4
4 RM F 4
5 RM M 4
6 PF 4
7 PP M 4
8 RM M 4
9 RM M 4
10 RM M 4
11 PF M 4
12 PF F 4
13 RM M 4
14 RM M 4
15 RM F 4
16 RM M 4
17 RM M 4
# Find the name errors
sales %>%
select(salesperson, sedan)
# Find the syntax errors
sales %>%
group_by(Salesperson) %>%
mutate(avg_sedan = mean(Sedan),
avg_SUV = mean(SUV)
avg_truck = mean(Truck)
# Find the semantic errors
sales %>%
group_by(Salesperson) %>%
mutate(avg_sedan = median(Truck))
JMP
dplyr
# dplyr
# Collapse the whole table to one row holding the mean weight;
# na.rm = TRUE leaves missing weights out of the mean.
surveys %>%
  summarise(avg_wgt = mean(weight, na.rm = TRUE))
avg_wgt
1 42.67243
# One mean weight per taxa; missing weights are dropped inside mean()
grp_agg <- surveys %>%
  group_by(taxa) %>%
  summarise(avg = mean(weight, na.rm = TRUE))

grp_agg
# A tibble: 4 x 2
taxa avg
<fctr> <dbl>
1 Bird NaN
2 Rabbit NaN
3 Reptile NaN
4 Rodent 42.67243
# Remove the rows with a missing weight before grouping, so mean() no
# longer needs na.rm
grp_agg <- surveys %>%
  filter(!is.na(weight)) %>%
  group_by(taxa) %>%
  summarise(avg = mean(weight))

grp_agg
# A tibble: 1 x 2
taxa avg
<fctr> <dbl>
1 Rodent 42.67243
# Count the rows in every taxa / sex combination; n() is the group size
grp_agg_2 <- surveys %>%
  group_by(taxa, sex) %>%
  summarise(cnt = n())

grp_agg_2
# A tibble: 6 x 3
# Groups: taxa [?]
taxa sex cnt
<fctr> <fctr> <int>
1 Bird 450
2 Rabbit 75
3 Reptile 14
4 Rodent 1209
5 Rodent F 15690
6 Rodent M 17348
# Explore the output: group_by() on its own, with no summarise() after it
group_by(surveys, taxa)
# Load the auto sales table and print it
sales <- read.csv(file = "auto_sales.csv")

sales
Salesperson Compact Sedan SUV Truck
1 Ann 22 18 15 12
2 Bob 19 12 17 20
3 Yolanda 19 8 32 15
4 Xerxes 12 23 18 9
library(tidyr)
# gather, spread

# Add a car_sales column: Compact plus Sedan for each salesperson
sales %>%
  mutate(car_sales = Compact + Sedan)
Salesperson Compact Sedan SUV Truck car_sales
1 Ann 22 18 15 12 40
2 Bob 19 12 17 20 31
3 Yolanda 19 8 32 15 27
4 Xerxes 12 23 18 9 35
# Keep the salesperson plus the two columns that were added together
sales %>%
  select(Salesperson, Compact, Sedan)
Salesperson Compact Sedan
1 Ann 22 18
2 Bob 19 12
3 Yolanda 19 8
4 Xerxes 12 23
Arguments are:
library(tidyr)

# Stack the four vehicle columns into key/value pairs:
#   key   -> new column holding the old column names
#   value -> new column holding the old cell values
stacked_sales <- sales %>%
  gather(key = "auto_type", value = "num_sales", Compact, Sedan, SUV, Truck)

head(stacked_sales)
Salesperson auto_type num_sales
1 Ann Compact 22
2 Bob Compact 19
3 Yolanda Compact 19
4 Xerxes Compact 12
5 Ann Sedan 18
6 Bob Sedan 12
library(tidyr)

# Same reshape, naming the column to leave alone instead of listing the
# columns to stack: -Salesperson means every column except Salesperson
stacked_sales <- sales %>%
  gather(key = "auto_type", value = "num_sales", -Salesperson)

head(stacked_sales)
Salesperson auto_type num_sales
1 Ann Compact 22
2 Bob Compact 19
3 Yolanda Compact 19
4 Xerxes Compact 12
5 Ann Sedan 18
6 Bob Sedan 12
# With one row per salesperson / vehicle type, a single condition finds
# every count above 20
filter(stacked_sales, num_sales > 20)
Salesperson auto_type num_sales
1 Ann Compact 22
2 Xerxes Sedan 23
3 Yolanda SUV 32
# Map each vehicle type onto a broader category in a new car_type column
stacked_sales <- stacked_sales %>%
  mutate(
    car_type = recode(
      auto_type,
      Compact = "Car",
      Sedan = "Car",
      SUV = "Utility",
      Truck = "Utility"
    )
  )
# Total sales per salesperson and category, converted to a data frame
car_sales <- stacked_sales %>%
  group_by(Salesperson, car_type) %>%
  summarise(car_sales = sum(num_sales)) %>%
  as.data.frame()

car_sales
Salesperson car_type car_sales
1 Ann Car 40
2 Ann Utility 27
3 Bob Car 31
4 Bob Utility 37
5 Xerxes Car 35
6 Xerxes Utility 27
7 Yolanda Car 27
8 Yolanda Utility 47
Arguments are:
# Unstack: each car_type value becomes a column filled from car_sales
car_sales_unstacked <- car_sales %>%
  spread(key = car_type, value = car_sales) %>%
  as.data.frame()

car_sales_unstacked
Salesperson Car Utility
1 Ann 40 27
2 Bob 31 37
3 Xerxes 35 27
4 Yolanda 27 47